## packages: remove or add your necessary packages
# required_packages <- c("tidyverse", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext")
required_packages <- c("tidyverse", "here", "colorspace", "pdftools", "kableExtra", "ggrepel")
for(i in required_packages) {
if(!require(i, character.only = T)) {
# if package is not existing, install then load the package
install.packages(i, dependencies = T)
require(i, character.only = T)
}
}
## save plots?
save <- FALSE
#save <- FALSE
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
base_size = 10
my_theme <- function(){
theme_minimal() +
theme(
plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 1.4 * base_size),
plot.title.position = "plot",
plot.subtitle = element_text(size = base_size),
plot.caption = element_text(color = "grey40",
size = 0.9 * base_size),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = base_size,
face = "bold"),
axis.title.y = element_text(size = base_size,
face = "bold"),
axis.text = element_text(size = 0.9 * base_size,
color = "black",
face = "bold"),
# axis.ticks = element_blank(),
axis.ticks = element_line(color = "black"),
panel.grid.major.x = element_line(size = .4,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .4,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .4,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(2, "lines"),
panel.spacing.y = unit(1, "lines")
)
}
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main colors rankings.io
r_col <- "#D21D5C"results <- rio::import(here::here("proc_data", "proc_data.rds")) %>%
mutate(
City = forcats::as_factor(City),
date = lubridate::ymd(date)
)glimpse(results)## Rows: 1,888
## Columns: 5
## $ Sample_ID <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ City <fct> San Antonio, San Antonio, San Antonio, San Antonio,…
## $ Initial_Ahref_rank <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, …
## $ date <date> 2021-03-05, 2021-03-06, 2021-03-07, 2021-03-08, 20…
## $ rank <int> 7, 11, 9, 10, 10, 12, 9, 7, 7, 6, 12, 15, 15, 14, 1…
results %>%
ggplot() +
geom_line(aes(x=date, y=rank, group=Sample_ID)) +
# geom_smooth(aes(x=date, y=rank)) +
geom_line(data = results %>% group_by(date) %>% summarize(avg = mean(rank, na.rm = TRUE)),
aes(x=date, y=avg),
color = r_col,
size = 2) +
my_theme() +
scale_y_reverse(breaks = c(1,5,10,15,20,25,30)) +
scale_x_date(
breaks = c(min(results$date), "2021-03-12", "2021-03-12", "2021-03-19",
"2021-03-26", "2021-04-02", max(results$date)),
date_labels = "%d %b"
# date_breaks = "1 week",
# date_minor_breaks = "1 day"
) +
labs(
title = "Ranking Over Time" ,
subtitle = "Ranking of all samples over time and their average in pink."
)if(save == T){
ggsave(here::here("plots", "all_over_time.pdf"), name_plot,
width = 12.5, height = 8, device = cairo_pdf)
}# average rank of first five days:
avg_start_1w <- results %>% filter(date < "2021-03-12") %>%
# first the first week average by sample:
group_by(Sample_ID) %>%
summarize(avg = mean(rank, na.rm = TRUE)) %>%
# then the average of the samples:
ungroup() %>%
summarize(avg = mean(avg)) %>%
as.numeric()
# average rank of last five days:
avg_end_1w <- results %>% filter(date > "2021-03-29") %>%
# first the first week average by sample:
group_by(Sample_ID) %>%
summarize(avg = mean(rank, na.rm = TRUE)) %>%
# then the average of the samples:
ungroup() %>%
summarize(avg = mean(avg)) %>%
as.numeric()
rank_gain = avg_start_1w - avg_end_1wOn average, the law firms gained 2 ranks over the month (average of first week - average of last week).
all_avg_start_1w <- results %>%
filter(date < "2021-03-12") %>%
group_by(Sample_ID) %>%
summarise(`First Week` = mean(rank, na.rm = TRUE))
all_avg_end_1w <- results %>%
filter(date > "2021-03-29") %>%
group_by(Sample_ID) %>%
summarise(`Last Week` = mean(rank, na.rm = TRUE))
all_avg_wide = all_avg_start_1w %>%
left_join(all_avg_end_1w, by = "Sample_ID") %>%
mutate(
start_page = case_when(
`First Week` <= 10 ~ 1,
`First Week` > 10 &`First Week` <= 20 ~ 2,
`First Week` > 20 ~ 3
)
)
all_avg_long <- all_avg_wide %>%
pivot_longer(cols = c(2:3), names_to = "Week", values_to = "Average Rank")
# a slopegraph
ggplot(data = all_avg_long) +
geom_line(aes(x = Week, y = `Average Rank`, group = Sample_ID)) +
my_theme() +
scale_y_reverse(breaks = c(1,5,10,15,20,25,30)) +
labs(
title = "Average Rank in First and Last Week",
subtitle = "Pink: overall average\nGreen: average of those who started on first page\nOrange: average of those who started on second page\nRed: average of those who started on third (and fourth) page"
) +
# average:
geom_line(
data = all_avg_long %>% group_by(Week) %>% summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, group = 1),
color = r_col,
size = 1
) + geom_label_repel(
data = all_avg_long %>% group_by(Week) %>% summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, label = as.character(round(avg,1))),
size = 3, color = r_col
) +
# average of those started on first page:
geom_line(
data = all_avg_long %>% filter(start_page == 1) %>% group_by(Week) %>%
summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, group = 1),
color = "green",
size = 1
) + geom_label_repel(
data = all_avg_long %>% filter(start_page == 1) %>% group_by(Week) %>%
summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, label = as.character(round(avg,1))),
size = 3, color = "green"
) +
# average of those started on 2nd page:
geom_line(
data = all_avg_long %>% filter(start_page == 2) %>% group_by(Week) %>%
summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, group = 1),
color = "orange",
size = 1
) + geom_label_repel(
data = all_avg_long %>% filter(start_page == 2) %>% group_by(Week) %>%
summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, label = as.character(round(avg,1))),
size = 3, color = "orange"
) +
# average of those started on 3rd (and 4th) page:
geom_line(
data = all_avg_long %>% filter(start_page == 3) %>% group_by(Week) %>%
summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, group = 1),
color = "red",
size = 1
) + geom_label_repel(
data = all_avg_long %>% filter(start_page == 3) %>% group_by(Week) %>%
summarise(avg = mean(`Average Rank`)),
aes(x = Week, y = avg, label = as.character(round(avg,1))),
size = 3, color = "red"
)A paired t-test comparing the average ranking of each law firm during the first week and last week shows that the difference is statistically significant. We reject the null hypothesis that the difference in mean between the first week and last week in zero in favor of the alternative hypothesis: true difference in means is not equal to 0.
t.test(all_avg_wide$`First Week`,
all_avg_wide$`Last Week`,
paired=TRUE)##
## Paired t-test
##
## data: all_avg_wide$`First Week` and all_avg_wide$`Last Week`
## t = 4.7894, df = 58, p-value = 1.198e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.166379 2.841450
## sample estimates:
## mean of the differences
## 2.003914